Load the Packages

library(GenomicRanges)
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
##     tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
library(rtracklayer)
library(epiwraps)
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## Loading required package: EnrichedHeatmap
## Loading required package: grid
## Loading required package: ComplexHeatmap
## ========================================
## ComplexHeatmap version 2.20.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##     genomic data. Bioinformatics 2016.
## 
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
## ========================================
## EnrichedHeatmap version 1.34.0
## Bioconductor page: http://bioconductor.org/packages/EnrichedHeatmap/
## Github page: https://github.com/jokergoo/EnrichedHeatmap
## Documentation: http://bioconductor.org/packages/EnrichedHeatmap/
## 
## If you use it in published research, please cite:
## Gu, Z. EnrichedHeatmap: an R/Bioconductor package for comprehensive 
## visualization of genomic signal associations. BMC Genomics 2018.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(EnrichedHeatmap))
## ========================================
library(ggplot2)
library(rGREAT) # Gene Ontology enrichment among genomic regions
## 
## 
## ========================================
## rGREAT version 2.6.0
## Bioconductor page: http://bioconductor.org/packages/rGREAT/
## Github page: https://github.com/jokergoo/rGREAT
## 
## If you use it in published research, please cite:
## Gu, Z. rGREAT: an R/Bioconductor package for functional enrichment 
##   on genomic regions. Bioinformatics 2023.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(rGREAT))
## 
## Note: On Aug 19 2019 GREAT released version 4 where it supports `hg38`
## genome and removes some ontologies such pathways. `submitGreatJob()`
## still takes `hg19` as default. `hg38` can be specified by the `genome
## = 'hg38'` argument. To use the older versions such as 3.0.0, specify as
## `submitGreatJob(..., version = '3.0.0')`.
## 
## From rGREAT version 1.99.0, it also implements the GREAT algorithm and
## it allows to integrate GREAT analysis with the Bioconductor annotation
## ecosystem. By default it supports more than 500 organisms. Check the
## new function `great()` and the new vignette.
## ========================================

Download the Data

setwd("/Users/catherinerohner/ETH/Bioinformatics/Assignment10")
download.file("https://ethz-ins.org/content/w10.assignment.zip", "w10.assignment.zip")
unzip("w10.assignment.zip")
list.files()
## [1] "assignment.html"    "assignment.Rmd"     "Creb1.bed"         
## [4] "Creb1.bw"           "Creb3.bed"          "Creb3.bw"          
## [7] "Creb3L1.bed"        "Creb3L1.bw"         "w10.assignment.zip"

Filter and import high-confidence peaks to define regions for analysis. Load corresponding signal tracks.

peaks <- list.files(pattern="bed$")
# we first import the peaks
peaks <- lapply(peaks, rtracklayer::import.bed)
# we'll focus on the high-quality peaks
peaks <- lapply(peaks, FUN=function(x) x[x$score>800])
# we get the union of non-redundant regions
regions <- reduce(unlist(GRangesList(peaks)))
#l load tracks
tracks <- list.files(pattern="bw$")

Analyze Relationships Between CREB Family Transcription Factors

We utilize enriched heatmaps to examine the binding patterns of CREB1, CREB3, and CREB3L1 across the genomic regions. The initial plots reveal clear signals for all three transcription factors, suggesting strong binding activity. However, the complexity and overlap in data require further organization to better identify and understand the underlying patterns

ese <- signal2Matrix(tracks, regions, extend=2000)
## Reading Creb1.bw
## Reading Creb3.bw
## Reading Creb3L1.bw
plotEnrichedHeatmaps(ese, use_raster=FALSE)

## Find the appropriate number of clusters by analyzing the explained variance for each cluster count from 2 to 10.

The graph illustrates that k-values of 3, 4, and 5 are potential choices for clustering. Based on our discussions in class, it is often advantageous to start with a higher number of clusters (k=5) and then consolidate them if they appear redundant.

cl2 <- clusterSignalMatrices(ese, k=2:10)
ggplot(cl2$varExplained, aes(k, varExplained)) + geom_line()

Initial Clustering Analysis

We initially segmented the data into five clusters, which explained 74% of the variance. A comparison with four clusters, which accounted for 70% of the variance, shows minimal improvement in variance explanation when increasing to five clusters. This marginal gain suggests that clustering into four groups might be more appropriate. Additionally, clusters 2 and 3 in the five-cluster model contain notably fewer data points, which could indicate overfitting. This observation warrants further analysis to confirm the optimal cluster count.

set.seed(123)  # to ensure that it gives the same results everytime
cl <- clusterSignalMatrices(ese, k=5)
##   ~74% of the variance explained by clusters
table(cl)
## cl
##   1   2   3   4   5 
## 828 134 167 688 452
head(cl)
## [1] 5 4 1 1 4 4
## Levels: 1 2 3 4 5
length(cl)
## [1] 2269
length(regions)
## [1] 2269
rowData(ese)$cluster <- cl

Plotting Clustered Data with Enriched Heatmaps

Upon visualizing the data, Clusters 4 and 5 appear remarkably similar, underscoring our previous observations about diminishing returns when adding more clusters. In line with the strategy discussed in class to merge similar clusters for a clearer and more meaningful analysis, I will proceed by reducing the number of clusters to four.

plotEnrichedHeatmaps(ese, row_split="cluster", trim=0.99,
                     colors=c("white","darkred"),use_raster=FALSE)

## Detailed Clustering for k=4

The heatmap distinctly showcases four clusters, validating the decision to reduce the cluster count. Observations from the heatmap are as follows:

Cluster 1 shows the highest activity for Creb3L1, complemented by substantial signals for both Creb1 and Creb3, indicating a versatile regulatory region. Cluster 2 is characterized predominantly by high activity for Creb1, with moderate expression of Creb3L1 and minimal Creb3, suggesting a specific functional emphasis. Cluster 3 reveals a balanced high activity for both Creb1 and Creb3L1 but only a minimal presence of Creb3, reflecting a different aspect of regulatory interaction compared to other clusters. Cluster 4 displays a notably high signal for Creb3, distinguishing it significantly from the others.

cl <- clusterSignalMatrices(ese, k=4)
##   ~68% of the variance explained by clusters
rowData(ese)$cluster <- cl
table(cl)
## cl
##   1   2   3   4 
## 247 708 896 418
head(cl)
## [1] 3 3 2 2 2 3
## Levels: 1 2 3 4
mycolors <- c("1"="red", "2"="blue", "3"="darkgreen", "4"="black")
plotEnrichedHeatmaps(ese, row_split="cluster", mean_color=mycolors, colors=c("white","darkred"),use_raster=FALSE)

Visualizing the average signals across the clusters allows us to confirm specific characteristics:

Cluster 2 and 4 show distinct, strong signals for Creb1 and Creb3, respectively, highlighting their specialized activity profiles. Cluster 1 and Cluster 3 initially appear to have similar patterns. However, upon closer examination, it becomes evident that Cluster 3 has a significantly lower peak for Creb3, even though Creb1 and Creb3L1 are more pronounced compared to Cluster 1. This divergence suggests a unique regulatory pattern in Cluster 1 and 3, needing further investigation.***

d <- meltSignals(ese, splitBy=cl)
rowData(ese)$cluster <- cl
ggplot(d, aes(position, mean, colour=sample)) + geom_line(size=1.2) + facet_wrap(~split)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Enhancing Feature Comparison with Relative Scaling

To more accurately compare features across the same scale and identify more distinct patterns, we employ relative signal scaling. This adjustment reveals stark differences between the clusters that were not as apparent with absolute values:

Cluster 1 is characterized by very high peaks in Creb1, indicating a dominant presence of this transcription factor. Cluster 3, in contrast, shows high peaks in Creb3L1 with moderate signals for Creb1, highlighting a different regulatory emphasis. Interestingly, while previous analyses suggested that Cluster 1 had the highest signals for Creb3L1, relative scaling shows that Creb1 is actually more prominent.

cl <- clusterSignalMatrices(ese, k=4, scaleRows = TRUE)
##   ~86% of the variance explained by clusters
d <- meltSignals(ese, splitBy=cl)
ggplot(d, aes(position, mean, colour=sample)) + geom_line() + facet_wrap(~split)

mycolors <- c("1"="red", "2"="blue", "3"="darkgreen", "4"="black")
plotEnrichedHeatmaps(ese, row_split = cl,mean_color=mycolors, scale_rows = "global",use_raster=FALSE)

Get the description of the Cluster1.

Looking at the Values we can see that the enrichment analysis highlights a significant involvement of genomic regions in processes related to cell movement, including “locomotion”, “regulation of cell motility”, “regulation of cell migration” , “regulation of locomotion”, and “cell migration”. These processes show notable enrichment with genome fractions ranging from approximately 0.0977 to 0.166, and observed region hits from 139 to 206, indicating a strong representation within the analyzed genomic regions. Especially, “cell migration” and “anatomical structure morphogenesis” are among the most enriched processes, with 206 and 299 hits respectively, suggesting these areas as key functions regulated by the genomic regions in this cluster. Fold enrichment values range from about 1.348 to 1.718, underscoring a moderate to strong overrepresentation of these terms relative to the background. The p-values are highly significant, with adjusted p-values ranging from approximately 0.000912 to 0.009, reinforcing the statistical significance of the enrichment.

split_regions <- split(rowRanges(ese), rowData(ese)$cluster)

res <- great(split_regions[["1"]], gene_sets="GO:BP", tss_source="hg38", background=regions, cores=2)
## * TSS source: TxDb.
## * check whether TxDb package 'TxDb.Hsapiens.UCSC.hg38.knownGene' is installed.
## * gene ID type in the extended TSS is 'Entrez Gene ID'.
## * restrict chromosomes to 'chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chr10,
##     chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr20, chr21, chr22, chrX,
##     chrY, chrM'.
## * 18100/30601 protein-coding genes left.
## * update seqinfo to the selected chromosomes.
## * TSS extension mode is 'basalPlusExt'.
## * construct the basal domains by extending 5000bp to upstream and 1000bp to downsteram of TSS.
## * calculate distances to neighbour regions.
## * extend to both sides until reaching the neighbour genes or to the maximal extension.
## * use GO:BP ontology with 15517 gene sets (source: org.Hs.eg.db).
## * check gene ID type in `gene_sets` and in `extended_tss`.
## * use self-defined background regions.
## * remove excluded regions from background.
## * overlap `gr` to background regions (based on midpoint).
## * in total 247 `gr`.
## * overlap extended TSS to background regions.
## * check which genes are in the gene sets.
## * only take gene sets with size >= 5.
## * in total 2523 gene sets.
## * overlap `gr` to every extended TSS.
## * perform binomial test for each biological term.
bp <- getEnrichmentTables(res)
head(bp)
##           id                                                description
## 1 GO:0000122  negative regulation of transcription by RNA polymerase II
## 2 GO:0007093                    mitotic cell cycle checkpoint signaling
## 3 GO:0044774                 mitotic DNA integrity checkpoint signaling
## 4 GO:0032609                              type II interferon production
## 5 GO:0032649                regulation of type II interferon production
## 6 GO:1901991 negative regulation of mitotic cell cycle phase transition
##   genome_fraction observed_region_hits fold_enrichment      p_value   p_adjust
## 1      0.11013942                   47        1.727659 0.0001416468 0.09809264
## 2      0.01566830                   13        3.359112 0.0001707444 0.09809264
## 3      0.01199743                   10        3.374541 0.0009185531 0.15549272
## 4      0.01946805                   13        2.703485 0.0012678699 0.15549272
## 5      0.01946805                   13        2.703485 0.0012678699 0.15549272
## 6      0.01962182                   13        2.682298 0.0013587790 0.15549272
##   mean_tss_dist observed_gene_hits gene_set_size fold_enrichment_hyper
## 1        128373                 21            70              1.911982
## 2          2097                  7            13              3.431762
## 3          2545                  5             8              3.983295
## 4         16466                  4            13              1.961007
## 5         16466                  4            13              1.961007
## 6          2097                  7            16              2.788306
##   p_value_hyper p_adjust_hyper
## 1   0.001445350      0.1223225
## 2   0.001572323      0.1223225
## 3   0.003398650      0.1223225
## 4   0.132968076      0.3840423
## 5   0.132968076      0.3840423
## 6   0.006863803      0.1359743

Use ggplot2 to visualize results from enrichment analysis

This visualization effectively highlights our key findings. The most significant results are represented by yellow dots, with the size of each dot correlating to the number of observed hits. Notably, terms like “cellular response to stimulus” and “cell communication” emerge as highly significant, also boasting a substantial number of region hits. This underscores the importance of these biological processes in the regions analyzed. Given that the signal in Cluster 1 was notably high for Creb1—a transcription factor known for its regulatory roles—these findings align well with our expectations and further emphasize the regulatory significance of these genomic regions.

ggplot(head(bp, 15), aes(fold_enrichment, reorder(description, p_adjust), size=observed_region_hits, color=-log10(p_adjust))) +
  geom_point() + scale_color_viridis_c()